data <- read_csv("Manhattan_Restaurant_Inspection_Results.csv") %>%
clean_names()
if (!all(c("grade", "inspection_date") %in% colnames(data))) {
stop("The required columns 'grade' and 'inspection_date' are not found in the dataset.")
}
time_data <- data %>%
filter(!is.na(grade), !is.na(inspection_date)) %>%
mutate(year = as.numeric(format(as.Date(inspection_date, format = "%m/%d/%Y"), "%Y")))
grade_trends_borough <- time_data %>%
group_by(year, boro, grade) %>%
summarise(count = n(), .groups = "drop")
viridis_colors <- viridis(n = length(unique(grade_trends_borough$grade)))
plot_borough <- plot_ly(
grade_trends_borough,
x = ~year,
y = ~count,
color = ~grade,
colors = viridis_colors,
type = "scatter",
mode = "lines+markers",
split = ~grade
) %>%
layout(
title = list(text = "Grade Distribution Over Time by Borough"),
xaxis = list(title = "Year"),
yaxis = list(title = "Number of Restaurants"),
legend = list(title = list(text = "Grade"), x = 1.1),
plot_bgcolor = 'rgba(0, 0, 0, 0)',
paper_bgcolor = 'rgba(0, 0, 0, 0)'
)
grade_trends_cuisine <- time_data %>%
filter(cuisine_description == "Italian") %>%
group_by(year, cuisine_description, grade) %>%
summarise(count = n(), .groups = "drop")
plot_cuisine <- plot_ly(
grade_trends_cuisine,
x = ~year,
y = ~count,
color = ~grade,
colors = viridis_colors,
type = "scatter",
mode = "lines+markers",
split = ~grade
) %>%
layout(
title = list(text = "Grade Distribution Over Time for Italian Restaurants"),
xaxis = list(title = "Year"),
yaxis = list(title = "Number of Restaurants"),
legend = list(title = list(text = "Grade"), x = 1.1),
plot_bgcolor = 'rgba(0, 0, 0, 0)',
paper_bgcolor = 'rgba(0, 0, 0, 0)'
)
plot_borough
plot_cuisine
I use the plotly package to generate two interactive scatter plots to visualize the trend of restaurant levels over time.
These two charts respectively show the trend of the horizontal distribution of administrative regions over time. Grade Trends of Italian Restaurants: Pay Attention to the Time Grade Trends of Italian Cuisine. I chose Italian restaurants as a case study to showcase to you because of our personal love for them.
And I have customized interactive drawing with hover details, zooming, and switching functions. And it uses a transparent background and Viridis color palette, which is more beautiful.
The generated graphics are distributed in levels over time according to each region:
X-axis: Check the year. Y-axis: Obtain the number of restaurants for each level. Use case: Identify trends in administrative regions, such as the improvement of grades over time.
Grade distribution of Italian restaurants over time:
X-axis: Check the year. Y-axis: Number of Italian restaurants per level. Use case: Check if Italian restaurants maintain a higher level.
manhattan_data <- read_csv("Manhattan_Restaurant_Inspection_Results.csv", na = c("NA", "", "."))
cleaned_data <- manhattan_data %>%
janitor::clean_names() %>%
filter(
!is.na(dba),
!is.na(cuisine_description),
!is.na(grade),
!is.na(zipcode),
grade %in% c("A", "B", "C") # Keep only grades A, B, C
) %>%
mutate(
grade_numeric = case_when(
grade == "A" ~ 1,
grade == "B" ~ 2,
grade == "C" ~ 3
),
cuisine_description = factor(cuisine_description),
inspection_type = factor(inspection_type),
violation_code = str_trim(as.character(violation_code)),
location = case_when( # Define location as a categorical variable
zipcode >= 10000 & zipcode <= 10025 ~ "Downtown",
zipcode >= 10026 & zipcode <= 10040 ~ "Midtown",
zipcode >= 10041 & zipcode <= 10282 ~ "Uptown",
TRUE ~ "Other"
)
) %>%
mutate(location = factor(location)) # Convert location to factor
cleaned_data <- cleaned_data %>%
filter(grepl("^[A-Za-z0-9]+$", violation_code)) # Keep only alphanumeric values
predictors <- c("cuisine_description", "inspection_type", "violation_code", "location")
univariable_results <- lapply(predictors, function(predictor) {
formula <- as.formula(paste("grade_numeric ~", predictor))
model <- glm(formula, data = cleaned_data, family = quasipoisson) # Treat as ordinal outcome
tidy(model) %>%
mutate(predictor = predictor) %>%
select(predictor, term, estimate, std.error, p.value)
})
results_table <- bind_rows(univariable_results) %>%
filter(term != "(Intercept)") # Remove the intercept for clarity
# Show only the first 10 rows of the results table
kable(
head(results_table, 10), # Display the first 10 rows
format = "markdown",
col.names = c("Predictor", "Term", "Estimate", "Std. Error", "p-value"),
caption = "Top 10 Univariable Ordinal Regression Results"
)
| Predictor | Term | Estimate | Std. Error | p-value |
|---|---|---|---|---|
| cuisine_description | cuisine_descriptionAfrican | 0.7855205 | 0.1393366 | 0.0000000 |
| cuisine_description | cuisine_descriptionAmerican | 0.1848587 | 0.1348718 | 0.1704997 |
| cuisine_description | cuisine_descriptionArmenian | 0.4353181 | 0.1820141 | 0.0167765 |
| cuisine_description | cuisine_descriptionAsian/Asian Fusion | 0.3176720 | 0.1357685 | 0.0192987 |
| cuisine_description | cuisine_descriptionAustralian | 0.3456047 | 0.1407816 | 0.0140967 |
| cuisine_description | cuisine_descriptionBagels/Pretzels | 0.2147958 | 0.1366366 | 0.1159531 |
| cuisine_description | cuisine_descriptionBakery Products/Desserts | 0.2631724 | 0.1353316 | 0.0518243 |
| cuisine_description | cuisine_descriptionBangladeshi | 0.5108256 | 0.1757424 | 0.0036550 |
| cuisine_description | cuisine_descriptionBarbecue | 0.2455706 | 0.1421938 | 0.0841730 |
| cuisine_description | cuisine_descriptionBasque | 0.0000000 | 0.3812387 | 1.0000000 |
model_example <- glm(grade_numeric ~ cuisine_description, data = cleaned_data, family = quasipoisson)
residuals_data <- augment(model_example)
# QQ plot
ggplot(residuals_data, aes(sample = .resid)) +
stat_qq() +
stat_qq_line(color = "blue") +
labs(
title = "QQ Plot of Residuals",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
) +
theme_minimal()
In order to obtain this model, we first load the dataset
Manhathathattan_Tetaurant_Inspection-Results.cv and filter out
irrelevant or missing data. And perform conversion: Grade_ numeric:
Convert grades (A, B, C) into numerical values (1, 2, 3) for regression
analysis. Location: The postal code is divided into four areas: city
center, central city, upper city, and others. Violation-code: Cleaned up
to ensure it only contains alphanumeric values, removing unexpected
entries such as dates or special characters. And in order to make the
visualization more visually appealing, I chose to only display the first
ten lines of content.
Go Home